# Load the cleaned gapminder dataset from the working directory and keep it
# as a tibble for the dplyr/ggplot2 steps below.
gapminder <- as_tibble(read_csv(file = "gapminder_clean.csv"))
# Interactive scatter plot of CO2 emissions vs. GDP per capita for 1962.
# Missing continents are removed with is.na() rather than by comparing to the
# string "NA" (which only worked because filter() drops NA conditions).
# Kuwait is excluded -- presumably as an outlier; confirm with the author.
filtered_data <- filter(
  gapminder,
  Year == 1962,
  !is.na(continent),
  `Country Name` != "Kuwait"
)
ggplotly(
  ggplot(data = filtered_data) +
    geom_point(
      mapping = aes(
        x = `CO2 emissions (metric tons per capita)`,
        y = gdpPercap,
        color = continent,
        # `label` is not a geom_point aesthetic; it is passed so the country
        # name shows up in the plotly tooltip.
        label = `Country Name`,
        size = pop
      )
    ),
  tooltip = "all"
)
# Pearson correlation test between CO2 emissions and GDP per capita on the
# current `filtered_data` subset, followed by a short text report.
pearson <- cor.test(
  filtered_data$`CO2 emissions (metric tons per capita)`,
  filtered_data$`gdpPercap`,
  method = "pearson"
)
cat(
  "Pearson correlation of CO2 emissions (metric tons per capita) and gdpPercap, year 1962:\n",
  "Correlation value: ", pearson$estimate[["cor"]], "\n",
  "p-value: ", pearson$p.value
)
## Pearson correlation of CO2 emissions (metric tons per capita) and gdpPercap, year 1962:
## Correlation value: 0.8063295
## p-value: 1.082225e-25
# For every year other than 1962, compute the Pearson correlation between CO2
# emissions and GDP per capita, then keep the single year with the highest
# correlation. Rows with missing continent, GDP or CO2 values are dropped with
# is.na() so that cor() receives complete pairs.
unfiltered_data <- gapminder %>%
  filter(
    Year != 1962,
    `Country Name` != "Kuwait",
    !is.na(continent),
    !is.na(gdpPercap),
    !is.na(`CO2 emissions (metric tons per capita)`)
  ) %>%
  group_by(Year) %>%
  summarise(
    COR = cor(
      `CO2 emissions (metric tons per capita)`,
      gdpPercap,
      method = "pearson"
    )
  ) %>%
  arrange(desc(COR)) %>%
  slice_head(n = 1)
unfiltered_data
# Same scatter plot as for 1962, now for 1972. `filtered_data` is
# intentionally overwritten with the 1972 subset.
# Missing continents are removed with is.na() instead of a string comparison.
filtered_data <- filter(
  gapminder,
  Year == 1972,
  !is.na(continent),
  `Country Name` != "Kuwait"
)
ggplotly(
  ggplot(data = filtered_data) +
    geom_point(
      mapping = aes(
        x = `CO2 emissions (metric tons per capita)`,
        y = gdpPercap,
        color = continent,
        # `label` only feeds the plotly tooltip.
        label = `Country Name`,
        size = pop
      )
    ),
  tooltip = "all"
)
# Mean energy use per continent over all years. Rows with a missing continent
# or a missing energy value are dropped explicitly with is.na(), so mean()
# never sees an NA.
energy_by_continent <- gapminder %>%
  filter(
    !is.na(continent),
    !is.na(`Energy use (kg of oil equivalent per capita)`)
  ) %>%
  group_by(continent) %>%
  summarize(
    mean = mean(`Energy use (kg of oil equivalent per capita)`)
  )
print(energy_by_continent)
## # A tibble: 5 x 2
## continent mean
## <chr> <dbl>
## 1 Africa 699.
## 2 Americas 1704.
## 3 Asia 1867.
## 4 Europe 3146.
## 5 Oceania 3980.
# Interactive boxplot of energy use per continent. Rows without a continent
# are removed with is.na(); missing energy values are left for ggplot2 to drop.
ggplotly(
  ggplot(data = filter(gapminder, !is.na(continent))) +
    geom_boxplot(
      mapping = aes(
        x = continent,
        y = `Energy use (kg of oil equivalent per capita)`,
        fill = continent
      )
    )
)
I need to know whether the data satisfy the requirements of parametric tests.
First, I split the “Energy use” data by continent (five series, one per continent) and remove missing values to avoid errors in the tests.
# One subset per continent, keeping only rows where energy use is present.
# Missing values are detected with is.na() instead of comparing a numeric
# column to the string "NA".
americas_energy <- filter(
  gapminder,
  continent == "Americas",
  !is.na(`Energy use (kg of oil equivalent per capita)`)
)
africa_energy <- filter(
  gapminder,
  continent == "Africa",
  !is.na(`Energy use (kg of oil equivalent per capita)`)
)
asia_energy <- filter(
  gapminder,
  continent == "Asia",
  !is.na(`Energy use (kg of oil equivalent per capita)`)
)
europe_energy <- filter(
  gapminder,
  continent == "Europe",
  !is.na(`Energy use (kg of oil equivalent per capita)`)
)
oceania_energy <- filter(
  gapminder,
  continent == "Oceania",
  !is.na(`Energy use (kg of oil equivalent per capita)`)
)

# Shapiro-Wilk normality test on each continent's energy-use series.
shapiro_america <- shapiro.test(americas_energy$`Energy use (kg of oil equivalent per capita)`)
shapiro_africa <- shapiro.test(africa_energy$`Energy use (kg of oil equivalent per capita)`)
shapiro_asia <- shapiro.test(asia_energy$`Energy use (kg of oil equivalent per capita)`)
shapiro_europe <- shapiro.test(europe_energy$`Energy use (kg of oil equivalent per capita)`)
shapiro_oceania <- shapiro.test(oceania_energy$`Energy use (kg of oil equivalent per capita)`)
# Print the Shapiro-Wilk p-value for each continent.
# NOTE: the verdict text after each p-value is hard-coded, not derived from
# the p-value; it must be revisited if the underlying data change.
cat(
  "P-values: \n",
  "Americas: ", shapiro_america$p.value, " Reject null hypotesis", "\n",
  "Oceania: ", shapiro_oceania$p.value, "Can not reject null hypotesis", "\n",
  "Africa: ", shapiro_africa$p.value, " Reject null hypotesis", "\n",
  "Europe: ", shapiro_europe$p.value, " Reject null hypotesis", "\n",
  "Asia: ", shapiro_asia$p.value, " Reject null hypotesis"
)
## P-values:
## Americas: 1.586854e-21 Reject null hypotesis
## Oceania: 0.9552661 Can not reject null hypotesis
## Africa: 2.334331e-19 Reject null hypotesis
## Europe: 2.999487e-12 Reject null hypotesis
## Asia: 4.943111e-19 Reject null hypotesis
Shapiro-Wilk tests: the null hypothesis of normality is rejected for 4 of the 5 continents. The data are not normally distributed.
# Levene's test for homogeneity of variance of energy use across continents.
# `continent` is a character column; it is converted to a factor explicitly
# so leveneTest() does not have to coerce it (which raises a warning).
with(
  gapminder,
  leveneTest(
    `Energy use (kg of oil equivalent per capita)`,
    factor(continent)
  )
)
Levene test: the null hypothesis of equal variances is rejected.
Both the Shapiro-Wilk and the Levene tests reject their null hypotheses, so the data do not satisfy the parametric requirements. I therefore need to use non-parametric tests.
The Kruskal-Wallis H-test tests the null hypothesis that the population medians of all of the groups are equal.
# Kruskal-Wallis rank sum test of energy use grouped by continent.
gapminder %>%
  kruskal.test(
    `Energy use (kg of oil equivalent per capita)` ~ continent,
    data = .
  )
##
## Kruskal-Wallis rank sum test
##
## data: Energy use (kg of oil equivalent per capita) by continent
## Kruskal-Wallis chi-squared = 318.68, df = 4, p-value < 2.2e-16
Kruskal-Wallis H-test: the null hypothesis is rejected, so at least one continent differs from the others.
So I have to run pairwise comparisons to detect the differences and similarities between continents.
Dunn's test is a post hoc pairwise test for multiple comparisons of mean rank sums. It is run after the Kruskal-Wallis one-way analysis of variance by ranks to perform the pairwise comparisons.
# Dunn's post hoc pairwise comparisons of energy use between continents.
gapminder %>%
  dunn_test(`Energy use (kg of oil equivalent per capita)` ~ continent)
Asia’s and Americas’ energy use means are similar.
Oceania’s and Europe’s energy use means are similar.
# European and Asian records after 1990 with imports below 97% of GDP.
# Singapore is excluded by name.
# The `< 97` condition already evaluates to NA for missing imports, and
# filter() drops those rows, so no separate missing-value test is needed.
europe_and_asia_after_1990 <- filter(
  gapminder,
  `Country Name` != "Singapore",
  continent %in% c("Asia", "Europe"),
  Year > 1990,
  `Imports of goods and services (% of GDP)` < 97
)
# Horizontal boxplots of imports (% of GDP), one box per continent.
ggplot(
  data = europe_and_asia_after_1990,
  mapping = aes(
    x = `Imports of goods and services (% of GDP)`,
    y = continent,
    fill = continent
  )
) +
  geom_boxplot()
# Split the post-1990 subset by continent for the normality checks.
europe_imports <- europe_and_asia_after_1990 %>%
  filter(continent == "Europe")
asia_imports <- europe_and_asia_after_1990 %>%
  filter(continent == "Asia")

# Shapiro-Wilk normality test on European imports. The argument expression is
# kept as-is because shapiro.test() echoes it in the "data:" line.
shapiro.test(europe_imports$`Imports of goods and services (% of GDP)`)
##
## Shapiro-Wilk normality test
##
## data: europe_imports$`Imports of goods and services (% of GDP)`
## W = 0.92905, p-value = 1.363e-05
# Shapiro-Wilk normality test on Asian imports (% of GDP) after 1990.
shapiro.test(asia_imports$`Imports of goods and services (% of GDP)`)
##
## Shapiro-Wilk normality test
##
## data: asia_imports$`Imports of goods and services (% of GDP)`
## W = 0.9723, p-value = 0.04314
# Levene's test for equal variance of imports between Europe and Asia.
# `continent` is passed as an explicit factor so leveneTest() does not have
# to coerce a character vector (which raises a warning).
with(
  europe_and_asia_after_1990,
  leveneTest(
    `Imports of goods and services (% of GDP)`,
    factor(continent)
  )
)
As in the previous case, the parametric requirements are not satisfied. I need to compare the two groups using a non-parametric test.
The Mann-Whitney U test is used to compare differences between two independent groups when the dependent variable is either ordinal or continuous, but not normally distributed.
# Mann-Whitney U (Wilcoxon rank sum) test of imports between the continents.
europe_and_asia_after_1990 %>%
  wilcox.test(
    `Imports of goods and services (% of GDP)` ~ continent,
    data = .
  )
##
## Wilcoxon rank sum test with continuity correction
##
## data: Imports of goods and services (% of GDP) by continent
## W = 5251, p-value = 0.8053
## alternative hypothesis: true location shift is not equal to 0
The null hypothesis of no location shift between the two continents cannot be rejected, so there is no significant difference in imports between Europe and Asia.
# Population density records with a known country and a known density value.
# Missing values are removed with is.na() rather than a comparison to "NA".
pop_density <- gapminder %>%
  select(
    `Country Name`, Year, continent,
    `Population density (people per sq. km of land area)`
  ) %>%
  filter(
    !is.na(`Country Name`),
    !is.na(`Population density (people per sq. km of land area)`)
  )

# Five countries with the highest mean population density over all years.
pop_density %>%
  group_by(`Country Name`) %>%
  summarise(
    mean = mean(`Population density (people per sq. km of land area)`)
  ) %>%
  arrange(desc(mean)) %>%
  slice_head(n = 5)
# Interactive line chart of population density over time, one line per
# country. The legend is hidden; an earlier legend.position = "bottom"
# setting was always overridden by the "none" setting and has been dropped.
ggplotly(
  ggplot(data = pop_density) +
    geom_line(
      mapping = aes(
        x = Year,
        y = `Population density (people per sq. km of land area)`,
        group = `Country Name`,
        color = `Country Name`
      )
    ) +
    theme_bw() +
    theme(legend.position = "none")
)
I’ll extract the first and the last record that contain a “Life expectancy at birth” value for each country (not the minimum and maximum values), subtract the first value from the last, and then calculate the relative increment as a percentage:
relative increment (%) = (last record - first record)/first record * 100
# Per-country change in life expectancy between the earliest and the latest
# year that has a value.
# Rows are sorted by Year before grouping so that first()/last() really pick
# the chronologically first and last records instead of depending on the row
# order of the input file. Missing values are removed with is.na().
life_expectancy_by_country <- gapminder %>%
  select(`Country Name`, Year, `Life expectancy at birth, total (years)`) %>%
  filter(
    !is.na(`Country Name`),
    !is.na(`Life expectancy at birth, total (years)`)
  ) %>%
  arrange(Year) %>%
  group_by(`Country Name`) %>%
  summarise(
    `Life expectancy. Last record.(years)` =
      last(`Life expectancy at birth, total (years)`),
    `Absolute difference (years)` =
      last(`Life expectancy at birth, total (years)`) -
      first(`Life expectancy at birth, total (years)`),
    # relative increment (%) = (last - first) / first * 100
    `Relative difference, (%)` =
      (last(`Life expectancy at birth, total (years)`) -
        first(`Life expectancy at birth, total (years)`)) /
      first(`Life expectancy at birth, total (years)`) * 100
  ) %>%
  arrange(desc(`Relative difference, (%)`))

# Five countries with the largest relative increase.
slice_head(life_expectancy_by_country, n = 5)
# Interactive bubble chart: relative change in life expectancy (x) against
# the latest life expectancy (y); bubble size is the absolute change.
# Fixes: removed the stray trailing comma in scale_size() and the
# legend.position = "bottom" setting that was always overridden by "none".
ggplotly(
  ggplot(data = life_expectancy_by_country) +
    aes(
      x = `Relative difference, (%)`,
      y = `Life expectancy. Last record.(years)`,
      size = `Absolute difference (years)`,
      fill = `Country Name`
    ) +
    # shape 21 is a filled circle, so `fill` colours the bubble interior.
    geom_point(alpha = 0.5, shape = 21) +
    scale_size(range = c(.1, 10)) +
    theme_bw() +
    theme(legend.position = "none")
)